First, let’s get the different libraries we’re going to use.
knitr::opts_chunk$set(echo = TRUE)
library(sigfit)
library(ggplot2)
library(data.table)
library(dplyr)
sigfit is a bit annoying and requires specific bullshit to run (like a specific order for the nucleotide contexts and a particular arrangement of the matrix so)
secondceph <- read.csv('/Users/quinlan/Documents/Quinlan-PhD/SpermSeq/spermseq/Ceph2ndGenTriCollapsedPaternal.csv', header = FALSE, sep = ' ')
# these are the mutation counts for paternal DNNMs in the second generation, collapsed to 96 trinucleotide contexts that match sigfit's requirements
#thirdceph <- read.csv('/Users/quinlan/Documents/Quinlan-PhD/SpermSeq/spermseq/Ceph3rdGenTriCollapsedPaternal.csv', header = FALSE, sep = ' ')
# here is the third generation if desired for comparable analysis
data("cosmic_signatures_v3.2")
# here is what we're going to base our sigfit order stuff off of
cosmicswitched <- as.data.frame(t(as.data.frame(cosmic_signatures_v3.2)))
#let's get it into a dataframe for the next part with switched rows/columns
cosmicswitched <- setDT(cosmicswitched, keep.rownames = TRUE)[]
#I want the row names to be a row to match by
cosmicswitched <- cosmicswitched[,1:2]
# let's clean it up
cosmicswitched$id <- 1:nrow(cosmicswitched)
# I need to code in the order for later
colnames(secondceph)[2] <- "rn"
# matching column names to merge
merged <- merge(cosmicswitched,secondceph, by = "rn")
merged <- merged[order(merged$id), ]
#I've merged and ordered by the order column, so stuff is where it should be
secondceph <- merged[, c("V1", "rn")]
# I update my secondceph df to be in the right order
### now for some final touches
secondcephswitched <- t(as.data.frame(secondceph[,1]))
secondcephswitched<- secondcephswitched/rowSums(secondcephswitched)
colnames(secondcephswitched) <- secondceph$rn
# this should now be in the right row vs column matrix, with proportions, named,
# and in the right order
This one isn’t so bad.
sperm_count <- read.csv('/Users/quinlan/Documents/Quinlan-PhD/SpermSeq/spermseq/SpermMutationCountMatrix.csv')
spermswitched <- t(as.data.frame(sperm_count[-1]))
colnames(spermswitched) <- colnames(cosmic_signatures_v3.2)
#we can take a look at the sasani data we're putting in as a signature like this
par(mar=c(5,6,6.5,1))
plot_spectrum(secondcephswitched)
## Okay, let’s do the actual analysis
mcmc_samples_fit <- fit_extract_signatures(
counts=spermswitched, #mutation counts from our data set
signatures=secondcephswitched, #our signature we're putting in from Sasani
num_extra_sigs = 1, #getting one additional signature
iter=50000,
warmup=25000,
chains=1,
# doesn't recommend multiple chains for this
seed=1896)
## ---
## Fit-Ext: Fitting 1 signatures and extracting 1 signature(s) using multinomial model
## ---
## Stan sampling:
## SAMPLING FOR MODEL 'sigfit_fitext' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000611 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 6.11 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 50000 [ 0%] (Warmup)
## Chain 1: Iteration: 5000 / 50000 [ 10%] (Warmup)
## Chain 1: Iteration: 10000 / 50000 [ 20%] (Warmup)
## Chain 1: Iteration: 15000 / 50000 [ 30%] (Warmup)
## Chain 1: Iteration: 20000 / 50000 [ 40%] (Warmup)
## Chain 1: Iteration: 25000 / 50000 [ 50%] (Warmup)
## Chain 1: Iteration: 25001 / 50000 [ 50%] (Sampling)
## Chain 1: Iteration: 30000 / 50000 [ 60%] (Sampling)
## Chain 1: Iteration: 35000 / 50000 [ 70%] (Sampling)
## Chain 1: Iteration: 40000 / 50000 [ 80%] (Sampling)
## Chain 1: Iteration: 45000 / 50000 [ 90%] (Sampling)
## Chain 1: Iteration: 50000 / 50000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 155.619 seconds (Warm-up)
## Chain 1: 118.528 seconds (Sampling)
## Chain 1: 274.147 seconds (Total)
## Chain 1:
par(mar = c(5,6,7,2))
extr_sigs <- retrieve_pars(mcmc_samples_fit, "signatures")
plot_spectrum(extr_sigs)
par(mar=c(8,5,3.5,0))
plot_exposures(mcmc_samples = mcmc_samples_fit)